Chapter 3 Data statistics
3.1 filter samples with high host data
sample_metadata <- sample_metadata%>%
filter(!sample %in% c("EHI02721", "EHI02712", "EHI02700", "EHI02720", "EHI02749", "EHI02719", "EHI02729", "EHI02715", "EHI02722"))
read_counts <- read_counts %>%
select(one_of(c("genome",sample_metadata$sample)))%>%
filter(rowSums(. != 0, na.rm = TRUE) > 0) %>%
select_if(~!all(. == 0))
genome_metadata <- genome_metadata %>%
semi_join(., read_counts, by = "genome") %>%
arrange(match(genome,read_counts1$genome))
genome_tree <- keep.tip(genome_tree, tip=genome_metadata$genome) # keep only MAG tips
#load("data/genome_gifts.Rdata")3.2 Sequencing reads statistics
sample_metadata %>%
summarise(Total=sum(reads_post_fastp * 150 / 1000000000) %>% round(2),
mean=mean(reads_post_fastp * 150 / 1000000000) %>% round(2),
sd=sd(reads_post_fastp * 150 / 1000000000) %>% round(2)) %>%
unite("Average",mean, sd, sep = " ± ", remove = TRUE) %>%
tt()| Total | Average |
|---|---|
| 222.06 | 4.93 ± 1.08 |
3.3 DNA fractions
sequence_fractions <- read_counts %>%
pivot_longer(-genome, names_to = "sample", values_to = "value") %>%
group_by(sample) %>%
summarise(mags = sum(value)) %>%
left_join(sample_metadata, by = join_by(sample == sample)) %>%
select(sample,mags,metagenomic_bases,host_bases,bases_lost_fastp_percent) %>%
mutate(mags_bases = mags*146) %>%
mutate(lowqual_bases = ((metagenomic_bases+host_bases)/(1-bases_lost_fastp_percent))-(metagenomic_bases+host_bases)) %>%
mutate(unmapped_bases = metagenomic_bases - mags_bases) %>%
mutate(unmapped_bases = ifelse(unmapped_bases < 0, 0, unmapped_bases)) %>%
select(sample, lowqual_bases, host_bases, unmapped_bases, mags_bases)
sequence_fractions %>%
mutate_at(vars(-sample), ~./1000000000) %>%
rename("Sample"=1, "Low quality"=2, "Mapped to host"=3, "Unmapped"=4, "Mapped to MAGs"=5) %>%
tt()| Sample | Low quality | Mapped to host | Unmapped | Mapped to MAGs |
|---|---|---|---|---|
| EHI01861 | 0.2815721 | 0.011886522 | 1.2899665 | 2.94492935 |
| EHI01963 | 0.2379339 | 0.377956197 | 0.7686364 | 2.26737109 |
| EHI01964 | 0.3498799 | 2.739970327 | 0.5635412 | 1.32056898 |
| EHI01965 | 0.3583169 | 0.398887483 | 1.3439432 | 3.82431144 |
| EHI01969 | 0.2444974 | 0.026139904 | 0.7635264 | 2.53338864 |
| EHI01971 | 0.3382550 | 0.014993899 | 1.0255034 | 3.94601821 |
| EHI01972 | 0.4703327 | 0.054667824 | 1.3684974 | 5.49040300 |
| EHI01973 | 0.3347793 | 0.064729480 | 1.5130592 | 4.84065102 |
| EHI01974 | 0.3671355 | 0.052747090 | 0.8921925 | 2.95848005 |
| EHI01975 | 0.4305111 | 0.123580711 | 1.9003492 | 5.31791948 |
| EHI01976 | 0.3856202 | 0.011574334 | 1.2666400 | 3.26276449 |
| EHI01977 | 0.4158680 | 0.747098387 | 1.7867623 | 3.87494103 |
| EHI01978 | 0.3929906 | 0.004864216 | 1.2394723 | 3.63470402 |
| EHI01980 | 0.3711274 | 0.013798033 | 1.7409090 | 5.56779249 |
| EHI01981 | 0.3633649 | 0.020488667 | 1.2293197 | 3.03525532 |
| EHI01983 | 0.2418940 | 0.146776195 | 0.8284920 | 2.84914941 |
| EHI01984 | 0.4431283 | 0.007477209 | 1.2629388 | 5.76052943 |
| EHI01986 | 0.3104557 | 0.016788774 | 0.9597449 | 3.38641569 |
| EHI01987 | 0.4292930 | 0.001120181 | 0.9160842 | 3.32218036 |
| EHI01988 | 0.2963428 | 0.197271999 | 1.0599471 | 3.92577283 |
| EHI01990 | 0.3186345 | 0.010910854 | 1.0992821 | 3.87615385 |
| EHI01991 | 0.3161992 | 0.021159611 | 1.1871737 | 2.82147205 |
| EHI01993 | 0.2026808 | 0.006054774 | 0.5074436 | 2.59915566 |
| EHI01994 | 0.3359364 | 0.064796021 | 0.9475828 | 3.88007790 |
| EHI02699 | 0.2717760 | 0.007940738 | 1.1803471 | 3.05661322 |
| EHI02700 | 0.3187046 | 3.815365599 | 0.7469281 | 0.43060671 |
| EHI02701 | 0.2059134 | 0.135887208 | 0.9201707 | 2.92496093 |
| EHI02705 | 0.2326028 | 0.134644975 | 1.0103929 | 3.05008177 |
| EHI02707 | 0.2133975 | 0.197419011 | 1.0367591 | 2.56043485 |
| EHI02708 | 0.3287951 | 0.247517459 | 1.4970876 | 3.72574889 |
| EHI02710 | 0.2905179 | 0.004863423 | 1.1544139 | 3.25985646 |
| EHI02712 | 0.3735243 | 4.162982418 | 0.5973263 | 0.12825939 |
| EHI02714 | 0.2229262 | 0.166057029 | 0.7697399 | 3.24492767 |
| EHI02715 | 0.3350685 | 3.050470506 | 0.9265001 | 0.43417816 |
| EHI02718 | 0.2124597 | 1.622070570 | 0.4994065 | 1.33393017 |
| EHI02719 | 0.3330425 | 2.607725990 | 1.8321792 | 0.33907536 |
| EHI02720 | 0.3031357 | 3.089773324 | 0.7364466 | 0.19044999 |
| EHI02721 | 0.3228625 | 3.677630914 | 0.6704712 | 0.08601152 |
| EHI02722 | 0.3095520 | 2.799929187 | 0.7276464 | 0.41217450 |
| EHI02723 | 0.2730593 | 0.113227422 | 0.9750562 | 3.21338043 |
| EHI02724 | 0.3551211 | 0.121469083 | 0.9809314 | 3.69454095 |
| EHI02728 | 0.3109121 | 1.043367740 | 0.9818139 | 2.94984474 |
| EHI02729 | 0.3043389 | 2.024187975 | 1.5671125 | 0.47668212 |
| EHI02749 | 0.4389880 | 4.177705067 | 0.7870159 | 0.53921786 |
| EHI02751 | 0.4202354 | 3.712236730 | 0.7804933 | 1.77651793 |
sequence_fractions %>%
mutate_at(vars(-sample), ~./1000000000) %>%
rename("Sample"=1, "Low quality"=2, "Mapped to host"=3, "Unmapped"=4, "Mapped to MAGs"=5) %>%
summarise(across(where(is.numeric), mean, na.rm = TRUE)) %>%
tt()| Low quality | Mapped to host | Unmapped | Mapped to MAGs |
|---|---|---|---|
| 0.3247485 | 0.9344047 | 1.063094 | 2.779287 |
sequence_fractions %>%
pivot_longer(!sample, names_to = "fraction", values_to = "value") %>%
mutate(value = value / 1000000000) %>%
mutate(fraction = factor(fraction, levels = c("lowqual_bases","host_bases","unmapped_bases","mags_bases"))) %>%
ggplot(., aes(x = sample, y = value, fill=fraction)) +
geom_bar(position="stack", stat = "identity") +
scale_fill_manual(name="Sequence type",
breaks=c("lowqual_bases","host_bases","unmapped_bases","mags_bases"),
labels=c("Low quality","Mapped to host","Unmapped","Mapped to MAGs"),
values=c("#CCCCCC", "#bcdee1", "#d8b8a3","#93655c"))+
labs(x = "Samples", y = "Amount of data (GB)") +
theme_classic() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size=6),legend.position = "bottom")3.4 Recovered microbial fraction
singlem_table <- sequence_fractions %>%
mutate(mags_proportion = round((mags_bases / (mags_bases + unmapped_bases))*100,2)) %>%
left_join(sample_metadata, by = join_by(sample == sample)) %>%
mutate(singlem_proportion = round(singlem_fraction*100,2)) %>%
select(sample,mags_proportion,singlem_proportion) %>%
mutate(mags_proportion = ifelse(singlem_proportion == 0, 0, mags_proportion)) %>% #convert zeros to NA
mutate(singlem_proportion = ifelse(singlem_proportion == 0, NA, singlem_proportion)) %>% #convert zeros to NA
mutate(singlem_proportion = ifelse(singlem_proportion < mags_proportion, NA, singlem_proportion)) %>% #if singlem is smaller, then NA, to simplify plot
mutate(singlem_proportion = ifelse(singlem_proportion > 100, 100, singlem_proportion)) #simplify
singlem_table %>%
pivot_longer(!sample, names_to = "proportion", values_to = "value") %>%
mutate(proportion = factor(proportion, levels = c("mags_proportion","singlem_proportion"))) %>%
ggplot(., aes(x = value, y = sample, color=proportion)) +
geom_line(aes(group = sample), color = "#f8a538") +
geom_point() +
scale_color_manual(name="Proportion",
breaks=c("mags_proportion","singlem_proportion"),
labels=c("Recovered","Estimated"),
values=c("#52e1e8", "#876b53"))+
theme_classic() +
labs(y = "Samples", x = "Prokaryotic fraction (%)") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size=6),legend.position = "right")damr <- singlem_table %>%
mutate(damr=ifelse(is.na(singlem_proportion),100,mags_proportion/singlem_proportion*100)) %>%
select(sample,damr)
damr %>%
left_join(sample_metadata, by = join_by(sample == sample)) %>%
filter(diet=="Wild")%>%
summarise(mean=mean(damr),sd=sd(damr)) %>%
tt()| mean | sd |
|---|---|
| 53.68476 | 25.59637 |
damr %>%
left_join(sample_metadata, by = join_by(sample == sample)) %>%
filter(region=="Nafarroa")%>%
summarise(mean=mean(damr),sd=sd(damr)) %>%
tt()| mean | sd |
|---|---|
| 75.85542 | 4.081913 |
| sample | damr |
|---|---|
| EHI01861 | 69.54 |
| EHI01963 | 74.68 |
| EHI01964 | 70.09 |
| EHI01965 | 74.00 |
| EHI01969 | 76.84 |
| EHI01971 | 79.37 |
| EHI01972 | 80.05 |
| EHI01973 | 76.19 |
| EHI01974 | 76.83 |
| EHI01975 | 73.67 |
| EHI01976 | 72.04 |
| EHI01977 | 68.44 |
| EHI01978 | 74.57 |
| EHI01980 | 76.18 |
| EHI01981 | 71.17 |
| EHI01983 | 77.47 |
| EHI01984 | 82.02 |
| EHI01986 | 77.92 |
| EHI01987 | 78.39 |
| EHI01988 | 78.74 |
| EHI01990 | 77.91 |
| EHI01991 | 70.38 |
| EHI01993 | 83.67 |
| EHI01994 | 80.37 |
| EHI02699 | 72.14 |
| EHI02700 | 36.57 |
| EHI02701 | 76.07 |
| EHI02705 | 75.12 |
| EHI02707 | 71.18 |
| EHI02708 | 71.34 |
| EHI02710 | 73.85 |
| EHI02712 | 17.68 |
| EHI02714 | 80.83 |
| EHI02715 | 31.91 |
| EHI02718 | 72.76 |
| EHI02719 | 15.62 |
| EHI02720 | 20.55 |
| EHI02721 | 11.37 |
| EHI02722 | 36.16 |
| EHI02723 | 76.72 |
| EHI02724 | 79.02 |
| EHI02728 | 75.03 |
| EHI02729 | 23.32 |
| EHI02749 | 40.66 |
| EHI02751 | 69.48 |
[1] 0
| mean | sd |
|---|---|
| 65.50911 | 20.77582 |